#Data Cleaning-part1

library(dplyr)
library(tidyverse)
library(rsample)
# Load the raw wine reviews; the CSV is read relative to the working directory.
wine <- read_csv(file = "winemag-data-130k-v2.csv")

# Pull the vintage year out of each title as the first standalone four-digit
# number starting with 19 or 20. Stripping every letter and punctuation mark
# instead keeps all digits in the title, so "No. 5 2012" becomes " 5 2012"
# (coerced to NA by as.numeric()) and "3-2013" becomes the bogus year 32013.
# The lookarounds stop the pattern from matching inside a longer digit run.
y <- str_extract(wine$title, "(?<![0-9])(?:19|20)[0-9]{2}(?![0-9])")

# title1 holds the numeric vintage; titles without a matching year get NA.
wine1 <- wine %>%
  mutate(title1 = as.numeric(y))
names(wine1)
##  [1] "X1"                    "country"              
##  [3] "description"           "designation"          
##  [5] "points"                "price"                
##  [7] "province"              "region_1"             
##  [9] "region_2"              "taster_name"          
## [11] "taster_twitter_handle" "title"                
## [13] "variety"               "winery"               
## [15] "title1"

 

 

# Horizontal bar chart of the mean review score for ten selected varieties,
# with bars ordered by that mean. Rows with a missing score are excluded
# before averaging.
wine %>%
  filter(
    variety %in% c(
      "Pinot Noir", "Chardonnay", "Cabernet Sauvignon", "Red Blend",
      "Bordeaux-style Red Blend", "Riesling", "Sauvignon Blanc", "Syrah",
      "Rosé", "Merlot"
    ),
    !is.na(points)
  ) %>%
  group_by(variety) %>%
  summarize(avg_point_by_variety = mean(points)) %>%
  ggplot(
    aes(
      x = reorder(variety, avg_point_by_variety),
      y = avg_point_by_variety
    )
  ) +
  geom_col(fill = "darkblue") +
  scale_y_continuous(name = "Average Rating", breaks = seq(0, 100, by = 20)) +
  labs(
    title = "Average Rating per Bottle",
    subtitle = "Top 10 Varieties"
  ) +
  # Flip so the variety names read horizontally along the vertical axis.
  coord_flip() +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank(),
    axis.title.y = element_blank()
  )

#Correlation between Price and Rating of wine.

# Keep only the reviews for six selected varieties.
winefilter <- wine %>%
  filter(
    variety %in% c(
      "Pinot Noir", "Chardonnay", "Cabernet Sauvignon", "Red Blend",
      "Bordeaux-style Red Blend", "Riesling"
    )
  )

# Draw 250 random rows without replacement for plotting. No seed is set here,
# so the sampled rows differ from run to run.
sample <- winefilter[sample.int(nrow(winefilter), 250), ]

# Price against rating: points colored by variety plus a single smoothed trend
# line across all varieties. ylim() drops observations priced outside 0-300
# before the smoother is fit.
ggplot(sample, aes(x = points, y = price)) +
  geom_point(aes(color = variety), alpha = 0.5) +
  geom_smooth(se = FALSE) +
  theme_classic() +
  labs(x = "Ratings", y = "Price") +
  ylim(0, 300)

 

 

#Animation of price and rating over time (by country)

#This animation provides two insights. First, aged wine comes mainly from Italy, France and Germany, represented by the more greenish dots, while new wine comes mostly from the U.S. and Spain, represented by the more reddish dots. Second, the price range of aged wine is larger than that of new wine.

library(plotly)
library(ggplot2)
# Create the plot
# Restrict to vintages 1994-2017. Rows whose vintage is NA are dropped too,
# because filter() only keeps rows where the condition is TRUE.
wine1 <- wine1 %>%
  filter(title1 >= 1994 & title1 <= 2017)

# Static scatter plot of price against rating, colored by country. `frame` is
# not a ggplot2 aesthetic (ggplot2 warns about it); ggplotly() uses it to
# build one animation frame per vintage.
p <- ggplot(wine1, aes(x = points, y = price, color = country)) +
  geom_point(aes(frame = title1), alpha = 0.6) +
  ggtitle("Price vs Rating Over Times",
          subtitle = "Wine Vintage Throughout the World") +
  xlim(75, 100) +
  ylim(0, 400) +
  theme_classic() +
  labs(x = "Rating", y = "Price", color = "Country")

# Convert to plotly FIRST so that height/width take effect. Calling
# animation_slider() directly on the ggplot object converts it to a plotly
# widget implicitly; a later ggplotly() call on that widget returns it
# unchanged, and the requested size is dropped.
p <- ggplotly(p, height = 600, width = 500) %>%
  animation_slider(
    currentvalue = list(prefix = "YEAR ", font = list(color = "red"))
  )

# Linear easing between frames; redraw = FALSE skips a full redraw per frame.
wine2 <- p %>%
  animation_opts(easing = "linear", redraw = FALSE)
wine2